package cn.lh.spider;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cn.lh.pipeline.MysqlPipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic page processor for www.liangjiang.gov.cn.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>list pages ({@code /Class/node_244.htm}, {@code /Class/node_244_3.htm}): only used to
 *       discover article links and further list pages;</li>
 *   <li>article pages ({@code /Content/2015-12/17/content_225523.htm}, optionally with a
 *       {@code _N} suffix for the N-th page of a multi-page article): title, release date,
 *       canonical URL and cleaned body are extracted into result fields.</li>
 * </ul>
 */
public class PolicySpider2 implements PageProcessor {

	/** URL pattern of list pages. */
	private static final String LIST_URL_REGEX =
			"http://www\\.liangjiang\\.gov\\.cn/Class/node_[\\d_]*\\.htm";

	/** URL pattern of article pages, with or without a page-number suffix. */
	private static final String ARTICLE_URL_REGEX =
			"http://www\\.liangjiang\\.gov\\.cn/Content/[\\d-]*/\\d{1,2}/content[\\d_]*\\.htm";

	/** Matches only article URLs that carry a page-number suffix; group 1 is that page number. */
	private static final String PAGE_ORDER_REGEX =
			"http://www\\.liangjiang\\.gov\\.cn/Content/[\\d-]*/\\d{1,2}/content_[\\d]*_(\\d*)\\.htm";

	/** Same URLs as {@link #PAGE_ORDER_REGEX}; group 1 is the URL without the suffix and extension. */
	private static final String PAGE_BASE_REGEX =
			"(http://www\\.liangjiang\\.gov\\.cn/Content/[\\d-]*/\\d{1,2}/content_[\\d]*)_\\d*\\.htm";

	/** Release date as it appears inside the {@code div.time} text, e.g. {@code 2015/12/17}. */
	private static final Pattern RELEASE_DATE = Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}");

	private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

	/**
	 * Dispatches the page to list-page or article-page handling based on its URL.
	 *
	 * @param page the downloaded page
	 */
	public void process(Page page) {
		if (page.getUrl().regex(LIST_URL_REGEX).match()) {
			processListPage(page);
		} else {
			processArticlePage(page);
		}
	}

	/** Queues every article link and every pager link found in the list container. */
	private void processListPage(Page page) {
		page.addTargetRequests(page.getHtml()
				.xpath("//div[@class='Articlelists platebox']/div[@class='list']")
				.links().regex(ARTICLE_URL_REGEX).all());
		page.addTargetRequests(page.getHtml()
				.xpath("//div[@class='Articlelists platebox']/div[@class='list']/div[@id='displaypagenum']/center")
				.links().regex(LIST_URL_REGEX).all());
	}

	/**
	 * Extracts the result fields of an article page. Pages without a
	 * {@code div.Article_content} element are ignored (no fields are written).
	 */
	private void processArticlePage(Page page) {
		Selectable content = page.getHtml().xpath("//div[@class='Article_content']");
		if (content.nodes().isEmpty()) {
			return;
		}

		// HTML fragments (one per source line) that must be stripped from the body.
		// Larger fragments are collected first so they are also removed first; removing a
		// smaller nested fragment first would stop the enclosing fragment from matching.
		List<String> fragments = new ArrayList<String>();

		// A <center> element containing "previous page"/"next page" marks a multi-page article.
		boolean paginated = false;
		for (Selectable centerNode : content.xpath("//center").nodes()) {
			String html = centerNode.toString();
			if (html != null && (html.contains("上一页") || html.contains("下一页"))) {
				paginated = true;
			}
			addTrimmedLines(fragments, html);
		}

		if (!paginated) {
			page.putField("pagination", 0);
		} else {
			page.putField("pagination", 1);
			// The first page of a multi-page article has no numeric suffix in its URL.
			Selectable order = page.getUrl().regex(PAGE_ORDER_REGEX);
			if (order.nodes().isEmpty()) {
				page.putField("order", "1");
			} else {
				page.putField("order", order.toString());
			}
			// Follow the pager links to the remaining pages of this article.
			page.addTargetRequests(content.xpath("//center//a").links()
					.regex(ARTICLE_URL_REGEX).all());
		}

		// All pages of one article share the URL of its first page; always stored as a String.
		if (page.getUrl().regex(PAGE_ORDER_REGEX).match()) {
			String baseUrl = page.getUrl().regex(PAGE_BASE_REGEX).toString();
			page.putField("url", baseUrl.concat(".htm"));
		} else {
			page.putField("url", page.getUrl().toString());
		}

		page.putField("release_time", extractReleaseDate(page));
		page.putField("title", extractTitle(page));
		page.putField("type", "文章");
		page.putField("developName", "两江新区");
		page.putField("contentCategory", "投资动态");

		collectFragments(fragments, content.xpath("//p[contains(@style,'TEXT-ALIGN: center')]"));
		collectFragments(fragments, content.xpath("//a"));
		collectFragments(fragments, content.xpath("//img"));

		Selectable cleaned = content;
		for (String fragment : fragments) {
			if (fragment.length() == 0) {
				continue; // removing the empty string is a no-op
			}
			// Selectable.replace treats its first argument as a regex; the fragment is raw
			// HTML and must be matched literally.
			cleaned = cleaned.replace(Pattern.quote(fragment), "");
		}
		cleaned = cleaned.replace("\\n", "");
		page.putField("content", cleaned);
	}

	/**
	 * Returns the first {@code yyyy/M/d} date found in the text of {@code div.time}.
	 *
	 * @return the date string; an empty string when the element exists but holds no date;
	 *         {@code null} when the page has no {@code div.time} element
	 */
	private static String extractReleaseDate(Page page) {
		if (page.getHtml().xpath("//div[@class='time']").nodes().isEmpty()) {
			return null;
		}
		String text = page.getHtml().xpath("//div[@class='time']/text()").toString();
		if (text == null) {
			return "";
		}
		Matcher matcher = RELEASE_DATE.matcher(text);
		return matcher.find() ? matcher.group() : "";
	}

	/**
	 * Returns the trimmed text of {@code div.Article_title} with the markup of its first
	 * nested {@code <p>} (if any) removed, or an empty string when the page has no title.
	 */
	private static String extractTitle(Page page) {
		String title = page.getHtml().xpath("//div[@class='Article_title']/text()").toString();
		if (title == null) {
			return "";
		}
		title = title.trim();
		Selectable nestedParagraph = page.getHtml().xpath("//div[@class='Article_title']//p");
		if (!nestedParagraph.nodes().isEmpty()) {
			String paragraphHtml = nestedParagraph.toString();
			if (paragraphHtml != null) {
				title = title.replace(paragraphHtml.trim(), "");
			}
		}
		return title.trim();
	}

	/** Appends the trimmed source lines of every node in {@code selection} to {@code target}. */
	private static void collectFragments(List<String> target, Selectable selection) {
		for (Selectable node : selection.nodes()) {
			addTrimmedLines(target, node.toString());
		}
	}

	/** Splits {@code html} on newlines and appends each trimmed line to {@code target}. */
	private static void addTrimmedLines(List<String> target, String html) {
		if (html == null) {
			return;
		}
		for (String line : html.split("\\n")) {
			target.add(line.trim());
		}
	}

	/**
	 * @return the site configuration used by this processor
	 */
	public Site getSite() {
		return site;
	}

	/**
	 * Starts the crawl from list page {@code node_156} with five threads and stores the
	 * results through {@link MysqlPipeline}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// For debugging, a single article URL can be used as the start URL instead, e.g.
		//   http://www.liangjiang.gov.cn/Content/2012-05/11/content_4158_2.htm
		//   http://www.liangjiang.gov.cn/Content/2015-12/17/content_225523.htm
		// and a FilePipeline can replace MysqlPipeline to dump results to disk.
		Spider.create(new PolicySpider2())
				.addUrl("http://www.liangjiang.gov.cn/Class/node_156.htm")
				.addPipeline(new MysqlPipeline())
				.thread(5)
				.run();
	}
}
